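1.a Yhat = 41.45 + (-0.12 * x1) + (0.05 * x2) + (2.87 * x3) + (-18.26 * x4) + (3.67 * x5) + (-1.52 * x6) + (0.28 * x7) + (-0.01 * x8) + (-0.93 * x9) + (-0.55 * x10), where x1 = crim, x2 = zn, x3 = chas, x4 = nox, x5 = rm, x6 = dis, x7 = rad, x8 = tax, x9 = ptratio and x10 = lstat.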
# Fit the linear model of medv on the ten listed predictors of Boston and
# print its summary (coefficient table, residual standard error, R-squared).
summary(lm(medv ~ crim + zn + chas + nox + rm + dis + rad + tax + ptratio + lstat, data=Boston))
##
## Call:
## lm(formula = medv ~ crim + zn + chas + nox + rm + dis + rad +
## tax + ptratio + lstat, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.1814 -2.7625 -0.6243 1.8448 26.3920
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 41.451747 4.903283 8.454 3.18e-16 ***
## crim -0.121665 0.032919 -3.696 0.000244 ***
## zn 0.046191 0.013673 3.378 0.000787 ***
## chas 2.871873 0.862591 3.329 0.000935 ***
## nox -18.262427 3.565247 -5.122 4.33e-07 ***
## rm 3.672957 0.409127 8.978 < 2e-16 ***
## dis -1.515951 0.187675 -8.078 5.08e-15 ***
## rad 0.283932 0.063945 4.440 1.11e-05 ***
## tax -0.012292 0.003407 -3.608 0.000340 ***
## ptratio -0.930961 0.130423 -7.138 3.39e-12 ***
## lstat -0.546509 0.047442 -11.519 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.789 on 495 degrees of freedom
## Multiple R-squared: 0.7342, Adjusted R-squared: 0.7289
## F-statistic: 136.8 on 10 and 495 DF, p-value: < 2.2e-16
# Coefficient table of the 10-predictor model, transcribed by hand from the
# summary(lm(...)) output: one row per model term; the columns are the
# estimate, its standard error, the t statistic and the p-value.
# Some p-values are written as strings, so c() coerces every cell to character
# and datatable1 is a character matrix (which is why 4.440 prints as 4.44).
# The p-values of rm and lstat are reported by summary() only as the upper
# bound "< 2e-16", so they are recorded with the "<" sign.
datatable1 <- matrix(
  c(
    41.451747, 4.903283, 8.454, "3.18e-16",
    -0.121665, 0.032919, -3.696, 0.000244,
    0.046191, 0.013673, 3.378, 0.000787,
    2.871873, 0.862591, 3.329, 0.000935,
    -18.262427, 3.565247, -5.122, "4.33e-07",
    3.672957, 0.409127, 8.978, "< 2e-16",
    -1.515951, 0.187675, -8.078, "5.08e-15",
    0.283932, 0.063945, 4.440, "1.11e-05",
    -0.012292, 0.003407, -3.608, 0.000340,
    -0.930961, 0.130423, -7.138, "3.39e-12",
    -0.546509, 0.047442, -11.519, "< 2e-16"
  ),
  ncol = 4,
  byrow = TRUE
)
colnames(datatable1) <- c("Estimate", "Std. Error", "t value", "Pr(>|t|)")
# Row labels follow the term order of the fitted model: the intercept, then
# crim ... lstat. The response medv is not a coefficient and has no row.
rownames(datatable1) <- c(
  "intercept", "crim", "zn", "chas", "nox", "rm",
  "dis", "rad", "tax", "ptratio", "lstat"
)
as.table(datatable1)
## Estimate Std. Error t value Pr(>|t|)
## intercept 41.451747 4.903283 8.454 3.18e-16
## crim -0.121665 0.032919 -3.696 0.000244
## zn 0.046191 0.013673 3.378 0.000787
## chas 2.871873 0.862591 3.329 0.000935
## nox -18.262427 3.565247 -5.122 4.33e-07
## rm 3.672957 0.409127 8.978 < 2e-16
## dis -1.515951 0.187675 -8.078 5.08e-15
## rad 0.283932 0.063945 4.44 1.11e-05
## tax -0.012292 0.003407 -3.608 0.00034
## ptratio -0.930961 0.130423 -7.138 3.39e-12
## lstat -0.546509 0.047442 -11.519 < 2e-16
# Pairwise correlation matrix of all columns of Boston, rounded to one decimal
# place (used to look for strongly correlated predictors).
round(cor(Boston),1)
## crim zn indus chas nox rm age dis rad tax ptratio lstat medv
## crim 1.0 -0.2 0.4 -0.1 0.4 -0.2 0.4 -0.4 0.6 0.6 0.3 0.5 -0.4
## zn -0.2 1.0 -0.5 0.0 -0.5 0.3 -0.6 0.7 -0.3 -0.3 -0.4 -0.4 0.4
## indus 0.4 -0.5 1.0 0.1 0.8 -0.4 0.6 -0.7 0.6 0.7 0.4 0.6 -0.5
## chas -0.1 0.0 0.1 1.0 0.1 0.1 0.1 -0.1 0.0 0.0 -0.1 -0.1 0.2
## nox 0.4 -0.5 0.8 0.1 1.0 -0.3 0.7 -0.8 0.6 0.7 0.2 0.6 -0.4
## rm -0.2 0.3 -0.4 0.1 -0.3 1.0 -0.2 0.2 -0.2 -0.3 -0.4 -0.6 0.7
## age 0.4 -0.6 0.6 0.1 0.7 -0.2 1.0 -0.7 0.5 0.5 0.3 0.6 -0.4
## dis -0.4 0.7 -0.7 -0.1 -0.8 0.2 -0.7 1.0 -0.5 -0.5 -0.2 -0.5 0.2
## rad 0.6 -0.3 0.6 0.0 0.6 -0.2 0.5 -0.5 1.0 0.9 0.5 0.5 -0.4
## tax 0.6 -0.3 0.7 0.0 0.7 -0.3 0.5 -0.5 0.9 1.0 0.5 0.5 -0.5
## ptratio 0.3 -0.4 0.4 -0.1 0.2 -0.4 0.3 -0.2 0.5 0.5 1.0 0.4 -0.5
## lstat 0.5 -0.4 0.6 -0.1 0.6 -0.6 0.6 -0.5 0.5 0.5 0.4 1.0 -0.7
## medv -0.4 0.4 -0.5 0.2 -0.4 0.7 -0.4 0.2 -0.4 -0.5 -0.5 -0.7 1.0
# Scatter plot of medv against every other column of Boston, one plot per
# predictor, to judge by eye whether each relationship looks roughly linear.
x <- subset(Boston, select = -medv)
for (col in names(x)) {
  plot(
    # Plot the predictor on its own scale: no offset, so the axis values
    # really are the values of the variable named in xlab.
    x = x[[col]],
    y = Boston$medv,
    xlab = col,
    ylab = "medv",
    main = "x vs y"
  )
}
# Exhaustive best-subset search for medv over all other columns of Boston,
# keeping one model (nbest = 1) for each size up to 12 variables, then print
# which variables enter the selected model of each size.
best.model = regsubsets(medv~., data = Boston, nbest=1, nvmax=12)
summary(best.model)
## Subset selection object
## Call: regsubsets.formula(medv ~ ., data = Boston, nbest = 1, nvmax = 12)
## 12 Variables (and intercept)
## Forced in Forced out
## crim FALSE FALSE
## zn FALSE FALSE
## indus FALSE FALSE
## chas FALSE FALSE
## nox FALSE FALSE
## rm FALSE FALSE
## age FALSE FALSE
## dis FALSE FALSE
## rad FALSE FALSE
## tax FALSE FALSE
## ptratio FALSE FALSE
## lstat FALSE FALSE
## 1 subsets of each size up to 12
## Selection Algorithm: exhaustive
## crim zn indus chas nox rm age dis rad tax ptratio lstat
## 1 ( 1 ) " " " " " " " " " " " " " " " " " " " " " " "*"
## 2 ( 1 ) " " " " " " " " " " "*" " " " " " " " " " " "*"
## 3 ( 1 ) " " " " " " " " " " "*" " " " " " " " " "*" "*"
## 4 ( 1 ) " " " " " " " " " " "*" " " "*" " " " " "*" "*"
## 5 ( 1 ) " " " " " " " " "*" "*" " " "*" " " " " "*" "*"
## 6 ( 1 ) " " " " " " "*" "*" "*" " " "*" " " " " "*" "*"
## 7 ( 1 ) " " "*" " " "*" "*" "*" " " "*" " " " " "*" "*"
## 8 ( 1 ) "*" "*" " " "*" "*" "*" " " "*" " " " " "*" "*"
## 9 ( 1 ) "*" "*" " " " " "*" "*" " " "*" "*" "*" "*" "*"
## 10 ( 1 ) "*" "*" " " "*" "*" "*" " " "*" "*" "*" "*" "*"
## 11 ( 1 ) "*" "*" " " "*" "*" "*" "*" "*" "*" "*" "*" "*"
## 12 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"
# Validation-set estimate of the test error of the best subset of each size.
# The rows of Boston are split at random into two halves (fixed seed for
# reproducibility). Best-subset selection is run on the training half only,
# and the size-i model is scored by its mean squared prediction error on the
# held-out half.
n <- nrow(Boston)
set.seed(1)
train_index <- sample(seq_len(n), n / 2, replace = FALSE)
train <- Boston[train_index, ]
test <- Boston[-train_index, ]

# Select and estimate on the training rows only, so that the held-out rows
# play no part in choosing the subsets or their coefficients.
regfit.train <- regsubsets(medv ~ ., data = train, nbest = 1, nvmax = 12)

# Design matrix of the held-out rows; it does not depend on the model size,
# so it is built once outside the loop.
test.mat <- model.matrix(medv ~ ., data = test)

val.errors <- rep(NA_real_, 12)
for (i in seq_along(val.errors)) {
  coef.m <- coef(regfit.train, id = i)
  # Keep only the columns used by the size-i model, then form predictions.
  pred <- test.mat[, names(coef.m)] %*% coef.m
  # Test MSE: squared difference between observed and predicted medv.
  val.errors[i] <- mean((test$medv - pred)^2)
}
# Best-subset search for medv over every other column of Boston (one model per
# size, up to 12 variables), followed by a side-by-side table of selection
# criteria for each model size.
regfitt <- regsubsets(medv ~ ., data = Boston, nbest = 1, nvmax = 12)
regfitt.sum <- summary(regfitt)

# Criteria taken directly from the regsubsets summary.
rss <- regfitt.sum$rss
cp <- regfitt.sum$cp
adjr2 <- regfitt.sum$adjr2

# Row sums of the inclusion matrix: the number of terms flagged for each size.
p <- rowSums(regfitt.sum$which)

# AIC and BIC computed from the RSS; n is the sample size defined earlier in
# the script.
AIC <- 2 * p + n * log(rss / n)
BIC <- log(n) * p + n * log(rss / n)

cbind(rss, AIC, BIC, adjr2, cp)
## rss AIC BIC adjr2 cp
## 1 19472.38 1851.009 1859.462 0.5432418 343.848074
## 2 15439.31 1735.577 1748.256 0.6371245 170.658081
## 3 13727.99 1678.131 1695.038 0.6767036 98.320999
## 4 13228.91 1661.393 1682.526 0.6878351 78.641892
## 5 12469.34 1633.473 1658.832 0.7051702 47.647706
## 6 12141.07 1621.973 1651.559 0.7123567 35.388139
## 7 11976.67 1617.075 1650.887 0.7156820 30.246610
## 8 11805.76 1611.802 1649.841 0.7191751 24.822922
## 9 11606.40 1605.184 1647.450 0.7233609 18.162742
## 10 11352.19 1595.978 1642.470 0.7288734 9.120223
## 11 11350.50 1597.903 1648.622 0.7283649 11.046965
## 12 11349.42 1599.855 1654.800 0.7278399 13.000000
# Model size with the smallest AIC. The vector is named, so both the size
# label and the index are printed.
which.min(AIC)
## 10
## 10
# Model size with the smallest BIC.
which.min(BIC)
## 10
## 10
# Model size with the smallest Cp.
which.min(cp)
## [1] 10
# Model size with the largest adjusted R-squared.
which.max(adjr2)
## [1] 10
# Coefficients of the selected 10-variable model.
coef(regfitt,10)
## (Intercept) crim zn chas nox rm
## 41.45174748 -0.12166488 0.04619119 2.87187265 -18.26242664 3.67295747
## dis rad tax ptratio lstat
## -1.51595105 0.28393226 -0.01229150 -0.93096144 -0.54650916
1.b Since in class we discussed not removing variables unless necessary, I first checked the predictors for independence and found no problems, so I ran best subset selection with all of the independent variables. All of them except chas showed a linear relationship with medv. After running best subset selection I kept the chas variable, since it shows up in half of the best models.
1.c After that I checked which model had the lowest AIC, BIC and Cp and the highest adjusted R^2; the 10-variable model was picked by every criterion.
# Standard lm diagnostic plots for the selected 10-predictor model (m10) and
# for the same model with age added (m11).
m10 <- lm(
  formula = medv ~ crim + zn + chas + nox + rm + dis + rad + tax + ptratio + lstat,
  data = Boston
)
plot(m10)

m11 <- lm(
  formula = medv ~ crim + zn + chas + nox + rm + dis + rad + tax + ptratio + lstat + age,
  data = Boston
)
plot(m11)
1.d We assume a linear relationship between each independent variable and the dependent variable, and this held for most of them. I also checked for independence, and checked the diagnostic plots to make sure the residuals showed no patterns and looked approximately normal.
1.e For this I checked for multicollinearity but did not really find any variable with too high a correlation.
2.a I would assume that the best subset model will have the smallest training MSE, because for each size from 1 up to the number of predictors it considers every possible model and returns the one with the smallest RSS, so it reaches the smallest RSS possible for that size. Since MSE = RSS/n, we can conclude that best subset selection will also give the smallest training MSE.
2.b Best subset selection searches through more models and so can pick one with a lower training MSE, but this makes it more prone to overfitting, which can cause the test MSE to rise. However, it still has a better chance of finding a lower test MSE, since it picks from among more models.
2.c Yes, forward and backward selection led me to pick the same model. For both, the best model by AIC has 12 variables, and both give the same test MSE of 1795991.
# Preview the first rows of the College data.
head(College)
## Private Apps Accept Enroll Top10perc Top25perc
## Abilene Christian University Yes 1660 1232 721 23 52
## Adelphi University Yes 2186 1924 512 16 29
## Adrian College Yes 1428 1097 336 22 50
## Agnes Scott College Yes 417 349 137 60 89
## Alaska Pacific University Yes 193 146 55 16 44
## Albertson College Yes 587 479 158 38 62
## F.Undergrad P.Undergrad Outstate Room.Board Books
## Abilene Christian University 2885 537 7440 3300 450
## Adelphi University 2683 1227 12280 6450 750
## Adrian College 1036 99 11250 3750 400
## Agnes Scott College 510 63 12960 5450 450
## Alaska Pacific University 249 869 7560 4120 800
## Albertson College 678 41 13500 3335 500
## Personal PhD Terminal S.F.Ratio perc.alumni Expend
## Abilene Christian University 2200 70 78 18.1 12 7041
## Adelphi University 1500 29 30 12.2 16 10527
## Adrian College 1165 53 66 12.9 30 8735
## Agnes Scott College 875 92 97 7.7 37 19016
## Alaska Pacific University 1500 76 72 11.9 2 10922
## Albertson College 675 67 73 9.4 11 9727
## Grad.Rate
## Abilene Christian University 60
## Adelphi University 56
## Adrian College 54
## Agnes Scott College 59
## Alaska Pacific University 15
## Albertson College 55
# Random 90/10 split of the College rows (fixed seed for reproducibility),
# followed by forward and backward stepwise searches for Apps over up to 17
# terms.
set.seed(1)
n <- nrow(College)
train_i <- sample(seq_len(n), size = n * 0.9, replace = FALSE)
train <- College[train_i, ]
test <- College[-train_i, ]

# Both searches are run on the full College data.
regfit.bwd <- regsubsets(Apps ~ ., data = College, nvmax = 17, method = "backward")
regfit.fwd <- regsubsets(Apps ~ ., data = College, nvmax = 17, method = "forward")

# List the components available in the forward-selection summary.
regfit.fwd.sum <- summary(regfit.fwd)
names(regfit.fwd.sum)
## [1] "which" "rsq" "rss" "adjr2" "cp" "bic" "outmat" "obj"
# Selection criteria for the forward-stepwise models of each size.
n <- nrow(College)

# Criteria taken directly from the regsubsets summary.
rss <- regfit.fwd.sum$rss
cp <- regfit.fwd.sum$cp
adjr2 <- regfit.fwd.sum$adjr2

# number of predictors + intercept in the model
p <- rowSums(regfit.fwd.sum$which)

# AIC and BIC computed from the RSS.
AIC <- 2 * p + n * log(rss / n)
BIC <- log(n) * p + n * log(rss / n)
# Forward selection: model size with the smallest AIC. The vector is named,
# so both the size label and the index are printed.
which.min(AIC)
## 12
## 12
# Forward selection: model size with the smallest BIC.
which.min(BIC)
## 10
## 10
# Forward selection: model size with the largest adjusted R-squared.
which.max(adjr2)
## [1] 13
# Forward selection: model size with the smallest Cp.
which.min(cp)
## [1] 12
# Selection criteria for the backward-stepwise models of each size, computed
# the same way as for forward selection but from the backward fits.
regfit.bwd.sum <- summary(regfit.bwd)
names(regfit.bwd.sum)
## [1] "which" "rsq" "rss" "adjr2" "cp" "bic" "outmat" "obj"
nb <- nrow(College)
# Row sums of the inclusion matrix: the number of terms flagged for each size.
pb <- rowSums(regfit.bwd.sum$which)
adjr2b <- regfit.bwd.sum$adjr2
cpb <- regfit.bwd.sum$cp
rssb <- regfit.bwd.sum$rss
# AIC and BIC must be built from the backward-selection RSS (rssb) and its own
# sample size (nb). Using the forward-selection `rss` here would only
# reproduce the forward criteria with a different penalty vector.
AICb <- nb * log(rssb / nb) + 2 * pb
BICb <- nb * log(rssb / nb) + pb * log(nb)
# NOTE(review): which.min(AICb) / which.min(BICb) results recorded before this
# correction were computed from the forward-selection RSS; re-run to confirm.
# Backward selection: model size with the smallest AICb. The vector is named,
# so both the size label and the index are printed.
which.min(AICb)
## 12
## 12
# Backward selection: model size with the smallest BICb.
which.min(BICb)
## 10
## 10
# Backward selection: model size with the largest adjusted R-squared.
which.max(adjr2b)
## [1] 13
# Backward selection: model size with the smallest Cp.
which.min(cpb)
## [1] 12
# Intercept-only model for Apps: the lower bound of the stepwise search scope.
model0 <- lm(formula = Apps ~ 1, data = College)
summary(model0)
##
## Call:
## lm(formula = Apps ~ 1, data = College)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2921 -2226 -1444 622 45092
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3001.6 138.8 21.62 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3870 on 776 degrees of freedom
# Model for Apps using every other column of College: the upper bound of the
# stepwise search scope.
modelfull <- lm(formula = Apps ~ ., data = College)
summary(modelfull)
##
## Call:
## lm(formula = Apps ~ ., data = College)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4908.8 -430.2 -29.5 322.3 7852.5
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -445.08413 408.32855 -1.090 0.276053
## PrivateYes -494.14897 137.81191 -3.586 0.000358 ***
## Accept 1.58581 0.04074 38.924 < 2e-16 ***
## Enroll -0.88069 0.18596 -4.736 2.60e-06 ***
## Top10perc 49.92628 5.57824 8.950 < 2e-16 ***
## Top25perc -14.23448 4.47914 -3.178 0.001543 **
## F.Undergrad 0.05739 0.03271 1.754 0.079785 .
## P.Undergrad 0.04445 0.03214 1.383 0.167114
## Outstate -0.08587 0.01906 -4.506 7.64e-06 ***
## Room.Board 0.15103 0.04829 3.127 0.001832 **
## Books 0.02090 0.23841 0.088 0.930175
## Personal 0.03110 0.06308 0.493 0.622060
## PhD -8.67850 4.63814 -1.871 0.061714 .
## Terminal -3.33066 5.09494 -0.654 0.513492
## S.F.Ratio 15.38961 13.00622 1.183 0.237081
## perc.alumni 0.17867 4.10230 0.044 0.965273
## Expend 0.07790 0.01235 6.308 4.79e-10 ***
## Grad.Rate 8.66763 2.94893 2.939 0.003390 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1041 on 759 degrees of freedom
## Multiple R-squared: 0.9292, Adjusted R-squared: 0.9276
## F-statistic: 585.9 on 17 and 759 DF, p-value: < 2.2e-16
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:ISLR2':
##
## Boston
# Open the help page for stepAIC (only useful in an interactive session).
?stepAIC
# Forward stepwise selection by AIC: start from the intercept-only model and
# add one term at a time, never going beyond the terms of modelfull.
stepAIC(model0,scope=list(lower=model0,upper=modelfull),direction="forward")
## Start: AIC=12838.69
## Apps ~ 1
##
## Df Sum of Sq RSS AIC
## + Accept 1 1.0346e+10 1.2774e+09 11125
## + Enroll 1 8.3351e+09 3.2881e+09 11860
## + F.Undergrad 1 7.7108e+09 3.9125e+09 11995
## + Private 1 2.1701e+09 9.4531e+09 12680
## + P.Undergrad 1 1.8436e+09 9.7797e+09 12706
## + PhD 1 1.7742e+09 9.8491e+09 12712
## + Terminal 1 1.5869e+09 1.0036e+10 12727
## + Top25perc 1 1.4372e+09 1.0186e+10 12738
## + Top10perc 1 1.3344e+09 1.0289e+10 12746
## + Expend 1 7.8327e+08 1.0840e+10 12786
## + Personal 1 3.7130e+08 1.1252e+10 12816
## + Room.Board 1 3.1621e+08 1.1307e+10 12819
## + Grad.Rate 1 2.5033e+08 1.1373e+10 12824
## + Books 1 2.0424e+08 1.1419e+10 12827
## + S.F.Ratio 1 1.0630e+08 1.1517e+10 12834
## + perc.alumni 1 9.4622e+07 1.1529e+10 12834
## <none> 1.1623e+10 12839
## + Outstate 1 2.9243e+07 1.1594e+10 12839
##
## Step: AIC=11124.94
## Apps ~ Accept
##
## Df Sum of Sq RSS AIC
## + Top10perc 1 298543648 978867162 10920
## + Expend 1 237832439 1039578371 10967
## + Top25perc 1 172865433 1104545378 11014
## + Grad.Rate 1 80919712 1196491098 11076
## + Room.Board 1 73480089 1203930722 11081
## + Outstate 1 64480760 1212930051 11087
## + S.F.Ratio 1 59842954 1217567857 11090
## + perc.alumni 1 43974982 1233435829 11100
## + PhD 1 40339305 1237071506 11102
## + Terminal 1 34118294 1243292517 11106
## + Enroll 1 12102492 1265308319 11120
## + Books 1 7628546 1269782265 11122
## + F.Undergrad 1 5226771 1272184040 11124
## + P.Undergrad 1 4704068 1272706743 11124
## + Private 1 3980419 1273430392 11124
## <none> 1277410811 11125
## + Personal 1 1436992 1275973818 11126
##
## Step: AIC=10920.1
## Apps ~ Accept + Top10perc
##
## Df Sum of Sq RSS AIC
## + Expend 1 29658293 949208869 10898
## + Top25perc 1 22836110 956031052 10904
## + Enroll 1 13912570 964954593 10911
## + Private 1 10668209 968198953 10914
## + PhD 1 7596359 971270804 10916
## + Room.Board 1 6159533 972707629 10917
## + Outstate 1 5782676 973084487 10918
## + Terminal 1 5767400 973099762 10918
## + perc.alumni 1 5577199 973289963 10918
## + P.Undergrad 1 2567814 976299348 10920
## <none> 978867162 10920
## + F.Undergrad 1 1718371 977148792 10921
## + Personal 1 1404273 977462889 10921
## + Books 1 1098190 977768973 10921
## + Grad.Rate 1 315353 978551809 10922
## + S.F.Ratio 1 73757 978793406 10922
##
## Step: AIC=10898.2
## Apps ~ Accept + Top10perc + Expend
##
## Df Sum of Sq RSS AIC
## + Outstate 1 34037615 915171254 10872
## + Private 1 21921323 927287546 10882
## + Top25perc 1 14720207 934488662 10888
## + PhD 1 12430477 936778392 10890
## + Terminal 1 11956757 937252112 10890
## + perc.alumni 1 11589036 937619833 10891
## + Enroll 1 8007593 941201276 10894
## + S.F.Ratio 1 7737137 941471733 10894
## + P.Undergrad 1 2918539 946290330 10898
## <none> 949208869 10898
## + Personal 1 2112443 947096426 10898
## + Books 1 637723 948571146 10900
## + Room.Board 1 264603 948944266 10900
## + F.Undergrad 1 52599 949156270 10900
## + Grad.Rate 1 1172 949207697 10900
##
## Step: AIC=10871.82
## Apps ~ Accept + Top10perc + Expend + Outstate
##
## Df Sum of Sq RSS AIC
## + Enroll 1 29010663 886160591 10849
## + Room.Board 1 16274538 898896716 10860
## + Top25perc 1 10654640 904516614 10865
## + F.Undergrad 1 8296040 906875214 10867
## + PhD 1 7700769 907470485 10867
## + Grad.Rate 1 7321591 907849663 10868
## + Terminal 1 6021988 909149266 10869
## + Private 1 3410109 911761145 10871
## <none> 915171254 10872
## + S.F.Ratio 1 2129211 913042043 10872
## + perc.alumni 1 1979846 913191408 10872
## + P.Undergrad 1 318048 914853206 10874
## + Books 1 270308 914900946 10874
## + Personal 1 51356 915119898 10874
##
## Step: AIC=10848.79
## Apps ~ Accept + Top10perc + Expend + Outstate + Enroll
##
## Df Sum of Sq RSS AIC
## + Room.Board 1 11466506 874694084 10841
## + Top25perc 1 11359437 874801154 10841
## + Private 1 9551354 876609237 10842
## + F.Undergrad 1 6287657 879872934 10845
## + PhD 1 5184017 880976573 10846
## + Grad.Rate 1 4977276 881183315 10846
## + P.Undergrad 1 4196016 881964575 10847
## + Terminal 1 3667705 882492885 10848
## + S.F.Ratio 1 3471226 882689365 10848
## <none> 886160591 10849
## + perc.alumni 1 1150617 885009974 10850
## + Personal 1 364916 885795675 10850
## + Books 1 346795 885813795 10850
##
## Step: AIC=10840.67
## Apps ~ Accept + Top10perc + Expend + Outstate + Enroll + Room.Board
##
## Df Sum of Sq RSS AIC
## + Top25perc 1 11838452 862855633 10832
## + Private 1 10278235 864415849 10834
## + PhD 1 6633253 868060832 10837
## + Terminal 1 5679065 869015019 10838
## + F.Undergrad 1 5218200 869475884 10838
## + Grad.Rate 1 3809312 870884773 10839
## + S.F.Ratio 1 3268726 871425358 10840
## + P.Undergrad 1 2771544 871922540 10840
## <none> 874694084 10841
## + Personal 1 483131 874210954 10842
## + perc.alumni 1 430235 874263849 10842
## + Books 1 32071 874662014 10843
##
## Step: AIC=10832.09
## Apps ~ Accept + Top10perc + Expend + Outstate + Enroll + Room.Board +
## Top25perc
##
## Df Sum of Sq RSS AIC
## + Private 1 11486640 851368993 10824
## + F.Undergrad 1 6829025 856026608 10828
## + Grad.Rate 1 5022152 857833480 10830
## + PhD 1 4164868 858690765 10830
## + S.F.Ratio 1 3313460 859542173 10831
## + P.Undergrad 1 3150967 859704666 10831
## + Terminal 1 2866270 859989362 10832
## <none> 862855633 10832
## + Personal 1 424835 862430798 10834
## + perc.alumni 1 111184 862744449 10834
## + Books 1 56442 862799190 10834
##
## Step: AIC=10823.67
## Apps ~ Accept + Top10perc + Expend + Outstate + Enroll + Room.Board +
## Top25perc + Private
##
## Df Sum of Sq RSS AIC
## + PhD 1 10749717 840619277 10816
## + Terminal 1 8264997 843103997 10818
## + Grad.Rate 1 6882166 844486827 10819
## + F.Undergrad 1 3793524 847575469 10822
## <none> 851368993 10824
## + P.Undergrad 1 1682974 849686020 10824
## + S.F.Ratio 1 1313916 850055078 10824
## + Personal 1 283343 851085650 10825
## + Books 1 119956 851249037 10826
## + perc.alumni 1 3141 851365852 10826
##
## Step: AIC=10815.8
## Apps ~ Accept + Top10perc + Expend + Outstate + Enroll + Room.Board +
## Top25perc + Private + PhD
##
## Df Sum of Sq RSS AIC
## + Grad.Rate 1 7349094 833270183 10811
## + F.Undergrad 1 4405096 836214181 10814
## + P.Undergrad 1 2451457 838167820 10816
## <none> 840619277 10816
## + S.F.Ratio 1 1808849 838810427 10816
## + Terminal 1 487207 840132070 10817
## + Personal 1 284996 840334280 10818
## + perc.alumni 1 78065 840541211 10818
## + Books 1 3237 840616040 10818
##
## Step: AIC=10810.98
## Apps ~ Accept + Top10perc + Expend + Outstate + Enroll + Room.Board +
## Top25perc + Private + PhD + Grad.Rate
##
## Df Sum of Sq RSS AIC
## + F.Undergrad 1 5704713 827565469 10808
## + P.Undergrad 1 4323227 828946955 10809
## <none> 833270183 10811
## + S.F.Ratio 1 1711005 831559177 10811
## + Personal 1 841830 832428353 10812
## + Terminal 1 352500 832917683 10813
## + perc.alumni 1 117245 833152938 10813
## + Books 1 58008 833212174 10813
##
## Step: AIC=10807.64
## Apps ~ Accept + Top10perc + Expend + Outstate + Enroll + Room.Board +
## Top25perc + Private + PhD + Grad.Rate + F.Undergrad
##
## Df Sum of Sq RSS AIC
## + P.Undergrad 1 2248227 825317242 10808
## <none> 827565469 10808
## + S.F.Ratio 1 1437928 826127541 10808
## + Terminal 1 515424 827050045 10809
## + Personal 1 426703 827138766 10809
## + perc.alumni 1 39675 827525794 10810
## + Books 1 25040 827540429 10810
##
## Step: AIC=10807.53
## Apps ~ Accept + Top10perc + Expend + Outstate + Enroll + Room.Board +
## Top25perc + Private + PhD + Grad.Rate + F.Undergrad + P.Undergrad
##
## Df Sum of Sq RSS AIC
## <none> 825317242 10808
## + S.F.Ratio 1 1485954 823831288 10808
## + Terminal 1 513811 824803431 10809
## + Personal 1 227696 825089547 10809
## + perc.alumni 1 20671 825296571 10810
## + Books 1 13607 825303635 10810
##
## Call:
## lm(formula = Apps ~ Accept + Top10perc + Expend + Outstate +
## Enroll + Room.Board + Top25perc + Private + PhD + Grad.Rate +
## F.Undergrad + P.Undergrad, data = College)
##
## Coefficients:
## (Intercept) Accept Top10perc Expend Outstate Enroll
## -157.28686 1.58691 50.41132 0.07247 -0.09018 -0.88265
## Room.Board Top25perc PrivateYes PhD Grad.Rate F.Undergrad
## 0.14777 -14.74735 -511.78760 -10.70503 8.63961 0.05945
## P.Undergrad
## 0.04593
# Backward stepwise selection by AIC: start from the full model and drop one
# term at a time, never going below the intercept-only model.
stepAIC(modelfull,scope=list(lower=model0,upper=modelfull),direction="backward")
## Start: AIC=10815.4
## Apps ~ Private + Accept + Enroll + Top10perc + Top25perc + F.Undergrad +
## P.Undergrad + Outstate + Room.Board + Books + Personal +
## PhD + Terminal + S.F.Ratio + perc.alumni + Expend + Grad.Rate
##
## Df Sum of Sq RSS AIC
## - perc.alumni 1 2057 823062005 10813
## - Books 1 8332 823068280 10813
## - Personal 1 263707 823323655 10814
## - Terminal 1 463415 823523363 10814
## - S.F.Ratio 1 1518247 824578195 10815
## - P.Undergrad 1 2073704 825133652 10815
## <none> 823059948 10815
## - F.Undergrad 1 3337258 826397207 10816
## - PhD 1 3796560 826856508 10817
## - Grad.Rate 1 9368302 832428250 10822
## - Room.Board 1 10605426 833665374 10823
## - Top25perc 1 10951733 834011681 10824
## - Private 1 13942221 837002170 10826
## - Outstate 1 22020341 845080289 10834
## - Enroll 1 24321652 847381600 10836
## - Expend 1 43151679 866211628 10853
## - Top10perc 1 86866642 909926590 10891
## - Accept 1 1642984489 2466044437 11666
##
## Step: AIC=10813.4
## Apps ~ Private + Accept + Enroll + Top10perc + Top25perc + F.Undergrad +
## P.Undergrad + Outstate + Room.Board + Books + Personal +
## PhD + Terminal + S.F.Ratio + Expend + Grad.Rate
##
## Df Sum of Sq RSS AIC
## - Books 1 7989 823069994 10811
## - Personal 1 261699 823323704 10812
## - Terminal 1 461438 823523443 10812
## - S.F.Ratio 1 1517299 824579304 10813
## - P.Undergrad 1 2071755 825133760 10813
## <none> 823062005 10813
## - F.Undergrad 1 3335997 826398002 10814
## - PhD 1 3799154 826861159 10815
## - Grad.Rate 1 9830950 832892955 10821
## - Room.Board 1 10817132 833879137 10822
## - Top25perc 1 10979163 834041168 10822
## - Private 1 14029065 837091070 10824
## - Outstate 1 22841086 845903092 10833
## - Enroll 1 24505771 847567776 10834
## - Expend 1 43192465 866254470 10851
## - Top10perc 1 86934199 909996204 10889
## - Accept 1 1686807594 2509869599 11678
##
## Step: AIC=10811.41
## Apps ~ Private + Accept + Enroll + Top10perc + Top25perc + F.Undergrad +
## P.Undergrad + Outstate + Room.Board + Personal + PhD + Terminal +
## S.F.Ratio + Expend + Grad.Rate
##
## Df Sum of Sq RSS AIC
## - Personal 1 286484 823356478 10810
## - Terminal 1 453536 823523530 10810
## - S.F.Ratio 1 1523776 824593770 10811
## - P.Undergrad 1 2073781 825143774 10811
## <none> 823069994 10811
## - F.Undergrad 1 3337954 826407948 10813
## - PhD 1 3936953 827006947 10813
## - Grad.Rate 1 9823693 832893687 10819
## - Top25perc 1 10971468 834041462 10820
## - Room.Board 1 11052994 834122988 10820
## - Private 1 14021861 837091855 10822
## - Outstate 1 22934693 846004687 10831
## - Enroll 1 24507814 847577808 10832
## - Expend 1 43299794 866369788 10849
## - Top10perc 1 87145523 910215517 10888
## - Accept 1 1687626727 2510696721 11676
##
## Step: AIC=10809.68
## Apps ~ Private + Accept + Enroll + Top10perc + Top25perc + F.Undergrad +
## P.Undergrad + Outstate + Room.Board + PhD + Terminal + S.F.Ratio +
## Expend + Grad.Rate
##
## Df Sum of Sq RSS AIC
## - Terminal 1 474810 823831288 10808
## - S.F.Ratio 1 1446954 824803431 10809
## <none> 823356478 10810
## - P.Undergrad 1 2294055 825650532 10810
## - F.Undergrad 1 3524082 826880559 10811
## - PhD 1 3903516 827259993 10811
## - Grad.Rate 1 9576067 832932544 10817
## - Room.Board 1 10948047 834304524 10818
## - Top25perc 1 11009477 834365954 10818
## - Private 1 14045232 837401710 10821
## - Outstate 1 23757582 847114060 10830
## - Enroll 1 24529642 847886120 10830
## - Expend 1 43741282 867097760 10848
## - Top10perc 1 87332619 910689096 10886
## - Accept 1 1688998295 2512354773 11674
##
## Step: AIC=10808.13
## Apps ~ Private + Accept + Enroll + Top10perc + Top25perc + F.Undergrad +
## P.Undergrad + Outstate + Room.Board + PhD + S.F.Ratio + Expend +
## Grad.Rate
##
## Df Sum of Sq RSS AIC
## - S.F.Ratio 1 1485954 825317242 10808
## <none> 823831288 10808
## - P.Undergrad 1 2296253 826127541 10808
## - F.Undergrad 1 3402201 827233489 10809
## - Grad.Rate 1 9725730 833557018 10815
## - Room.Board 1 10580678 834411966 10816
## - Top25perc 1 11852393 835683681 10817
## - PhD 1 13174760 837006048 10818
## - Private 1 13675900 837507188 10819
## - Enroll 1 24420572 848251860 10829
## - Outstate 1 24881049 848712337 10829
## - Expend 1 43404484 867235772 10846
## - Top10perc 1 89940095 913771383 10887
## - Accept 1 1691950930 2515782218 11674
##
## Step: AIC=10807.53
## Apps ~ Private + Accept + Enroll + Top10perc + Top25perc + F.Undergrad +
## P.Undergrad + Outstate + Room.Board + PhD + Expend + Grad.Rate
##
## Df Sum of Sq RSS AIC
## <none> 825317242 10808
## - P.Undergrad 1 2248227 827565469 10808
## - F.Undergrad 1 3629713 828946955 10809
## - Grad.Rate 1 9850583 835167825 10815
## - Room.Board 1 10699017 836016260 10816
## - Top25perc 1 12037817 837355059 10817
## - PhD 1 12708568 838025810 10817
## - Private 1 15691081 841008323 10820
## - Enroll 1 24676722 849993965 10828
## - Outstate 1 26201946 851519188 10830
## - Expend 1 43734225 869051468 10846
## - Top10perc 1 89928332 915245574 10886
## - Accept 1 1696846612 2522163854 11674
##
## Call:
## lm(formula = Apps ~ Private + Accept + Enroll + Top10perc + Top25perc +
## F.Undergrad + P.Undergrad + Outstate + Room.Board + PhD +
## Expend + Grad.Rate, data = College)
##
## Coefficients:
## (Intercept) PrivateYes Accept Enroll Top10perc Top25perc
## -157.28686 -511.78760 1.58691 -0.88265 50.41132 -14.74735
## F.Undergrad P.Undergrad Outstate Room.Board PhD Expend
## 0.05945 0.04593 -0.09018 0.14777 -10.70503 0.07247
## Grad.Rate
## 8.63961
# Refit the 12-term model chosen by backward stepAIC on the training rows
# only, then measure its mean squared error on the training rows and on the
# held-out test rows.
# The model must be fitted on `train`: its fitted values then line up one to
# one with train$Apps, and the test rows stay out of the coefficient
# estimates. Fitting on all of College gave fitted values of a different
# length than train$Apps, which R recycled with a warning.
model_train <- lm(
  formula = Apps ~ Private + Accept + Enroll + Top10perc + Top25perc +
    F.Undergrad + P.Undergrad + Outstate + Room.Board + PhD + Expend +
    Grad.Rate,
  data = train
)
MSE_train <- mean((train$Apps - model_train$fitted.values)^2)
MSE_train
## (output not recorded: re-knit to obtain the training MSE)
# The value recorded before this correction, 28530514, came from the
# length-mismatched subtraction and is not a valid training MSE.
predicted_values <- predict(model_train, test)
MSE_test <- mean((test$Apps - predicted_values)^2)
MSE_test
## (output not recorded: re-knit to obtain the test MSE)
# The value recorded before this correction, 1795991, came from a model fitted
# on all rows, test rows included.
# NOTE(review): the set of terms was itself chosen by stepAIC on the full
# College data, so the test MSE may still be optimistic.
# Refit the 12-term model chosen by forward stepAIC on the training rows only,
# then measure its mean squared error on the training rows and on the held-out
# test rows.
# The model must be fitted on `train`: its fitted values then line up one to
# one with train$Apps, and the test rows stay out of the coefficient
# estimates. Fitting on all of College gave fitted values of a different
# length than train$Apps, which R recycled with a warning.
model_trainb <- lm(
  formula = Apps ~ Accept + Top10perc + Expend + Outstate + Enroll +
    Room.Board + Top25perc + Private + PhD + Grad.Rate + F.Undergrad +
    P.Undergrad,
  data = train
)
MSE_trainb <- mean((train$Apps - model_trainb$fitted.values)^2)
MSE_trainb
## (output not recorded: re-knit to obtain the training MSE)
# The value recorded before this correction, 28530514, came from the
# length-mismatched subtraction and is not a valid training MSE.
predicted_valuesb <- predict(model_trainb, test)
MSE_testb <- mean((test$Apps - predicted_valuesb)^2)
MSE_testb
## (output not recorded: re-knit to obtain the test MSE)
# The value recorded before this correction, 1795991, came from a model fitted
# on all rows, test rows included.
# NOTE(review): the set of terms was itself chosen by stepAIC on the full
# College data, so the test MSE may still be optimistic.
4.a
# Convert the four categorical columns of Credit to factors in one step, then
# preview the first rows.
Credit[c("Own", "Student", "Married", "Region")] <- lapply(
  Credit[c("Own", "Student", "Married", "Region")],
  factor
)
head(Credit)
## Income Limit Rating Cards Age Education Own Student Married Region Balance
## 1 14.891 3606 283 2 34 11 No No Yes South 333
## 2 106.025 6645 483 3 82 15 Yes Yes Yes West 903
## 3 104.593 7075 514 4 71 11 No No No West 580
## 4 148.924 9504 681 3 36 11 Yes No No West 964
## 5 55.882 4897 357 2 68 16 No No Yes South 331
## 6 80.180 8047 569 4 77 10 No No No South 1151
4.b
# Additive model: Balance on Income plus a Student indicator, i.e. a common
# Income slope with a separate intercept for students.
fit <- lm(formula = Balance ~ Income + Student, data = Credit)
summary(fit)
##
## Call:
## lm(formula = Balance ~ Income + Student, data = Credit)
##
## Residuals:
## Min 1Q Median 3Q Max
## -762.37 -331.38 -45.04 323.60 818.28
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 211.1430 32.4572 6.505 2.34e-10 ***
## Income 5.9843 0.5566 10.751 < 2e-16 ***
## StudentYes 382.6705 65.3108 5.859 9.78e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 391.8 on 397 degrees of freedom
## Multiple R-squared: 0.2775, Adjusted R-squared: 0.2738
## F-statistic: 76.22 on 2 and 397 DF, p-value: < 2.2e-16
4.c Student: Yhat = 593.8135 + 5.9843 * Income (intercept 211.1430 + 382.6705). Non-student: Yhat = 211.1430 + 5.9843 * Income.
4.d For both students and non-students, a one-unit increase in Income is associated with an increase of 5.9843 units in Balance, controlling for the other predictors.
4.e Looking at the plot below, we can see that it does not really make sense to assume that Balance grows with Income at the same rate for students and non-students: the lines fitted separately to the two groups have different slopes.
library(ggplot2)
# Balance against Income, coloured by student status, with a least-squares
# line drawn without its confidence band.
ggplot(
  data = Credit,
  mapping = aes(x = Income, y = Balance, color = Student)
) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'
4.f Student: Yhat = 677.2990 + 4.2190 * Income (intercept 200.6232 + 476.6758, slope 6.2182 - 1.9992). Non-student: Yhat = 200.6232 + 6.2182 * Income.
# Interaction model: the Income:Student term lets the Income slope differ
# between students and non-students.
summary(lm(Balance ~ Income + Student + Income:Student, data=Credit))
##
## Call:
## lm(formula = Balance ~ Income + Student + Income:Student, data = Credit)
##
## Residuals:
## Min 1Q Median 3Q Max
## -773.39 -325.70 -41.13 321.65 814.04
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 200.6232 33.6984 5.953 5.79e-09 ***
## Income 6.2182 0.5921 10.502 < 2e-16 ***
## StudentYes 476.6758 104.3512 4.568 6.59e-06 ***
## Income:StudentYes -1.9992 1.7313 -1.155 0.249
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 391.6 on 396 degrees of freedom
## Multiple R-squared: 0.2799, Adjusted R-squared: 0.2744
## F-statistic: 51.3 on 3 and 396 DF, p-value: < 2.2e-16
4.g Student: A one-unit increase in Income is associated with an increase of 4.2190 units in Balance (6.2182 - 1.9992), controlling for the other predictors. Non-student: A one-unit increase in Income is associated with an increase of 6.2182 units in Balance, controlling for the other predictors.